This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
library(tidyverse)
package 㤼㸱tidyverse㤼㸲 was built under R version 4.0.3Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
-- Attaching packages --------------------------------------- tidyverse 1.3.0 --
v ggplot2 3.3.3 v purrr 0.3.4
v tibble 3.0.3 v dplyr 1.0.2
v tidyr 1.1.1 v stringr 1.4.0
v readr 1.3.1 v forcats 0.5.0
package 㤼㸱ggplot2㤼㸲 was built under R version 4.0.3-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
library(tidymodels)
package 㤼㸱tidymodels㤼㸲 was built under R version 4.0.3-- Attaching packages -------------------------------------- tidymodels 0.1.1 --
v broom 0.7.0 v recipes 0.1.14
v dials 0.0.9 v rsample 0.0.8
v infer 0.5.3 v tune 0.1.1
v modeldata 0.1.0 v workflows 0.2.1
v parsnip 0.1.4 v yardstick 0.0.7
package 㤼㸱dials㤼㸲 was built under R version 4.0.3package 㤼㸱modeldata㤼㸲 was built under R version 4.0.3package 㤼㸱parsnip㤼㸲 was built under R version 4.0.3package 㤼㸱recipes㤼㸲 was built under R version 4.0.3package 㤼㸱rsample㤼㸲 was built under R version 4.0.3package 㤼㸱tune㤼㸲 was built under R version 4.0.3package 㤼㸱workflows㤼㸲 was built under R version 4.0.3-- Conflicts ----------------------------------------- tidymodels_conflicts() --
x scales::discard() masks purrr::discard()
x dplyr::filter() masks stats::filter()
x recipes::fixed() masks stringr::fixed()
x dplyr::lag() masks stats::lag()
x yardstick::spec() masks readr::spec()
x recipes::step() masks stats::step()
library(GGally)
package 㤼㸱GGally㤼㸲 was built under R version 4.0.3Registered S3 method overwritten by 'GGally':
method from
+.gg ggplot2
library(reshape2)
package 㤼㸱reshape2㤼㸲 was built under R version 4.0.3
Attaching package: 㤼㸱reshape2㤼㸲
The following object is masked from 㤼㸱package:tidyr㤼㸲:
smiths
library(randomForest)
package 㤼㸱randomForest㤼㸲 was built under R version 4.0.3randomForest 4.6-14
Type rfNews() to see new features/changes/bug fixes.
Attaching package: 㤼㸱randomForest㤼㸲
The following object is masked from 㤼㸱package:dplyr㤼㸲:
combine
The following object is masked from 㤼㸱package:ggplot2㤼㸲:
margin
library(rsample)
traninglab <- read.csv("./trainingsetlabels.csv",header = TRUE,stringsAsFactors = FALSE)
traningval <-read.csv("./trainingsetvalues.csv",header = TRUE,stringsAsFactors = FALSE)
datset <- tibble(merge(traninglab, traningval,by="id"))
head(datset)
# 41 variables list as int,num or chr. 59,400 observations
str(datset)
tibble [59,400 x 41] (S3: tbl_df/tbl/data.frame)
$ id : int [1:59400] 0 1 2 3 4 5 6 7 8 9 ...
$ status_group : chr [1:59400] "non functional" "functional" "functional" "functional" ...
$ amount_tsh : num [1:59400] 0 0 0 10 0 50 0 0 0 0 ...
$ date_recorded : chr [1:59400] "2012-11-13" "2011-03-05" "2011-03-27" "2013-06-03" ...
$ funder : chr [1:59400] "Tasaf" "Shipo" "Lvia" "Germany Republi" ...
$ gps_height : int [1:59400] 0 1978 0 1639 0 28 0 0 0 0 ...
$ installer : chr [1:59400] "TASAF" "SHIPO" "LVIA" "CES" ...
$ longitude : num [1:59400] 33.1 34.8 36.1 37.1 36.2 ...
$ latitude : num [1:59400] -5.12 -9.4 -6.28 -3.19 -6.1 ...
$ wpt_name : chr [1:59400] "Mratibu" "none" "Bombani" "Area 7 Namba 5" ...
$ num_private : int [1:59400] 0 0 0 0 0 0 0 0 0 0 ...
$ basin : chr [1:59400] "Lake Tanganyika" "Rufiji" "Wami / Ruvu" "Pangani" ...
$ subvillage : chr [1:59400] "Majengo" "Magoda C" "Songambele" "Urereni" ...
$ region : chr [1:59400] "Tabora" "Iringa" "Dodoma" "Kilimanjaro" ...
$ region_code : int [1:59400] 14 11 1 3 1 60 17 1 1 18 ...
$ district_code : int [1:59400] 3 4 4 5 4 43 3 1 5 8 ...
$ lga : chr [1:59400] "Uyui" "Njombe" "Chamwino" "Hai" ...
$ ward : chr [1:59400] "Igalula" "Uwemba" "Msamalo" "Masama Magharibi" ...
$ population : int [1:59400] 0 20 0 25 0 6922 0 0 0 0 ...
$ public_meeting : chr [1:59400] "" "True" "True" "True" ...
$ recorded_by : chr [1:59400] "GeoData Consultants Ltd" "GeoData Consultants Ltd" "GeoData Consultants Ltd" "GeoData Consultants Ltd" ...
$ scheme_management : chr [1:59400] "VWC" "" "VWC" "Water Board" ...
$ scheme_name : chr [1:59400] "" "" "Mgun" "Losaa-Kia water supply" ...
$ permit : chr [1:59400] "True" "False" "True" "True" ...
$ construction_year : int [1:59400] 0 2008 0 1999 0 0 0 0 0 0 ...
$ extraction_type : chr [1:59400] "afridev" "other - rope pump" "mono" "gravity" ...
$ extraction_type_group: chr [1:59400] "afridev" "rope pump" "mono" "gravity" ...
$ extraction_type_class: chr [1:59400] "handpump" "rope pump" "motorpump" "gravity" ...
$ management : chr [1:59400] "vwc" "vwc" "vwc" "water board" ...
$ management_group : chr [1:59400] "user-group" "user-group" "user-group" "user-group" ...
$ payment : chr [1:59400] "unknown" "never pay" "pay per bucket" "pay per bucket" ...
$ payment_type : chr [1:59400] "unknown" "never pay" "per bucket" "per bucket" ...
$ water_quality : chr [1:59400] "milky" "soft" "soft" "soft" ...
$ quality_group : chr [1:59400] "milky" "good" "good" "good" ...
$ quantity : chr [1:59400] "enough" "enough" "insufficient" "enough" ...
$ quantity_group : chr [1:59400] "enough" "enough" "insufficient" "enough" ...
$ source : chr [1:59400] "shallow well" "shallow well" "machine dbh" "spring" ...
$ source_type : chr [1:59400] "shallow well" "shallow well" "borehole" "spring" ...
$ source_class : chr [1:59400] "groundwater" "groundwater" "groundwater" "groundwater" ...
$ waterpoint_type : chr [1:59400] "hand pump" "hand pump" "communal standpipe multiple" "communal standpipe" ...
$ waterpoint_type_group: chr [1:59400] "hand pump" "hand pump" "communal standpipe" "communal standpipe" ...
# no blank observations
length(unique(datset$id)) - sum(complete.cases(datset))
[1] 0
# 4609 blank cells in datset
sum(datset=="")
[1] 46094
# replace blanks with NAs
datset[,][datset[,]==""] <-NA
# count the number NAs in datset (46094)
sum(is.na(datset))
[1] 46094
# percent of NAs by number of cells 1.892667
46094/(41*59400) *100
[1] 1.892667
# list the variables that the NAs fall under
map(datset,~sum(is.na(.)))
$id
[1] 0
$status_group
[1] 0
$amount_tsh
[1] 0
$date_recorded
[1] 0
$funder
[1] 3635
$gps_height
[1] 0
$installer
[1] 3655
$longitude
[1] 0
$latitude
[1] 0
$wpt_name
[1] 0
$num_private
[1] 0
$basin
[1] 0
$subvillage
[1] 371
$region
[1] 0
$region_code
[1] 0
$district_code
[1] 0
$lga
[1] 0
$ward
[1] 0
$population
[1] 0
$public_meeting
[1] 3334
$recorded_by
[1] 0
$scheme_management
[1] 3877
$scheme_name
[1] 28166
$permit
[1] 3056
$construction_year
[1] 0
$extraction_type
[1] 0
$extraction_type_group
[1] 0
$extraction_type_class
[1] 0
$management
[1] 0
$management_group
[1] 0
$payment
[1] 0
$payment_type
[1] 0
$water_quality
[1] 0
$quality_group
[1] 0
$quantity
[1] 0
$quantity_group
[1] 0
$source
[1] 0
$source_type
[1] 0
$source_class
[1] 0
$waterpoint_type
[1] 0
$waterpoint_type_group
[1] 0
unique<-datset%>%
map(., ~str_c(length(unique(.x)),collapse=',')) %>%
bind_rows() %>%
gather(key = col_name, value= col_unique)
# large percentage of NAs in $scheme_name (~47.4%)
28166/59400 *100
[1] 47.41751
# datset%>%
# mutate(funder=replace_na(funder,"other"))%>%
# map(., ~str_c(length(unique(.x)),collapse=',')) %>%
# bind_rows() %>%
# gather(key = col_name, value= col_unique)
# plot status_group by count
datset %>%
ggplot(., aes(x=status_group)) +
geom_histogram(stat = "count",aes(fill=status_group))
Ignoring unknown parameters: binwidth, bins, pad
Columns with given number of NAs in each column $funder [1] 3635 $installer [1] 3655 $subvillage [1] 371 $public_meeting [1] 3334 $scheme_name [1] 28166 $scheme_management [1] 3877 $permit [1] 3056
# plot status_group by count
datset %>%
ggplot(., aes(x=status_group)) +
geom_histogram(stat = "count",aes(fill=status_group))
Ignoring unknown parameters: binwidth, bins, pad
datset%>%
group_by(payment_type) %>%
summarise(status_group)%>%
ggplot( aes(x=payment_type)) +
geom_histogram(stat="count", aes(fill=status_group))+
theme(axis.text.x = element_text(angle=90))
`summarise()` regrouping output by 'payment_type' (override with `.groups` argument)
Ignoring unknown parameters: binwidth, bins, pad
datset%>%
group_by(basin) %>%
summarise(status_group)%>%
ggplot( aes(x=basin)) +
geom_histogram(stat="count", aes(fill=status_group))+
theme(axis.text.x = element_text(angle=90))
`summarise()` regrouping output by 'basin' (override with `.groups` argument)
Ignoring unknown parameters: binwidth, bins, pad
datset%>%
group_by(basin)%>%
ggplot(aes(x=basin))+
geom_point(aes(y= amount_tsh,color=status_group))+
theme(axis.text.x = element_text(angle=90))
datset%>%
group_by(basin)%>%
ggplot(aes(x=basin))+
geom_point(aes(y= gps_height,color=status_group))+
theme(axis.text.x = element_text(angle=90))
NA
# Shows duplicate data with slightly different labels, quality_group being a subset of water_quality
datset%>%
select(water_quality,quality_group)%>%
table()
quality_group
water_quality colored fluoride good milky salty unknown
coloured 490 0 0 0 0 0
fluoride 0 200 0 0 0 0
fluoride abandoned 0 17 0 0 0 0
milky 0 0 0 804 0 0
salty 0 0 0 0 4856 0
salty abandoned 0 0 0 0 339 0
soft 0 0 50818 0 0 0
unknown 0 0 0 0 0 1876
# Source is most granular with sources_type and source_class being subsets
datset%>%
select(source,source_type)%>%
table()
source_type
source borehole dam other rainwater harvesting river/lake shallow well spring
dam 0 656 0 0 0 0 0
hand dtw 874 0 0 0 0 0 0
lake 0 0 0 0 765 0 0
machine dbh 11075 0 0 0 0 0 0
other 0 0 212 0 0 0 0
rainwater harvesting 0 0 0 2295 0 0 0
river 0 0 0 0 9612 0 0
shallow well 0 0 0 0 0 16824 0
spring 0 0 0 0 0 0 17021
unknown 0 0 66 0 0 0 0
datset%>%
select(source,source_class)%>%
table()
source_class
source groundwater surface unknown
dam 0 656 0
hand dtw 874 0 0
lake 0 765 0
machine dbh 11075 0 0
other 0 0 212
rainwater harvesting 0 2295 0
river 0 9612 0
shallow well 16824 0 0
spring 17021 0 0
unknown 0 0 66
datset %>%
select(source_type,source_class) %>%
table ()
source_class
source_type groundwater surface unknown
borehole 11949 0 0
dam 0 656 0
other 0 0 278
rainwater harvesting 0 2295 0
river/lake 0 10377 0
shallow well 16824 0 0
spring 17021 0 0
# remove the variables that are duplicate, subsets, or not used in analysis
datdf <- datset %>%
select(-c(id,recorded_by,scheme_name,num_private,extraction_type_group,
extraction_type_class,payment,region_code,district_code,source_type,
source_class,waterpoint_type_group,quality_group,lga,ward,latitude,longitude,
construction_year,date_recorded,subvillage))
# replace all NA's with unknown (note: other was used in different variables)
datdf<- datdf %>%
replace(is.na(.),"unknown")
# Change data type to specific predictors
datdf <- transform(
datdf,
status_group = as.factor(status_group),
basin = as.factor(basin),
public_meeting = as.factor(public_meeting))
str(datdf)
'data.frame': 59400 obs. of 21 variables:
$ status_group : Factor w/ 3 levels "functional","functional needs repair",..: 3 1 1 1 3 1 3 1 3 3 ...
$ amount_tsh : num 0 0 0 10 0 50 0 0 0 0 ...
$ funder : chr "Tasaf" "Shipo" "Lvia" "Germany Republi" ...
$ gps_height : int 0 1978 0 1639 0 28 0 0 0 0 ...
$ installer : chr "TASAF" "SHIPO" "LVIA" "CES" ...
$ wpt_name : chr "Mratibu" "none" "Bombani" "Area 7 Namba 5" ...
$ basin : Factor w/ 9 levels "Internal","Lake Nyasa",..: 4 7 9 6 9 9 1 7 9 5 ...
$ region : chr "Tabora" "Iringa" "Dodoma" "Kilimanjaro" ...
$ population : int 0 20 0 25 0 6922 0 0 0 0 ...
$ public_meeting : Factor w/ 3 levels "False","True",..: 3 2 2 2 2 2 2 2 2 2 ...
$ scheme_management: chr "VWC" "unknown" "VWC" "Water Board" ...
$ permit : chr "True" "False" "True" "True" ...
$ extraction_type : chr "afridev" "other - rope pump" "mono" "gravity" ...
$ management : chr "vwc" "vwc" "vwc" "water board" ...
$ management_group : chr "user-group" "user-group" "user-group" "user-group" ...
$ payment_type : chr "unknown" "never pay" "per bucket" "per bucket" ...
$ water_quality : chr "milky" "soft" "soft" "soft" ...
$ quantity : chr "enough" "enough" "insufficient" "enough" ...
$ quantity_group : chr "enough" "enough" "insufficient" "enough" ...
$ source : chr "shallow well" "shallow well" "machine dbh" "spring" ...
$ waterpoint_type : chr "hand pump" "hand pump" "communal standpipe multiple" "communal standpipe" ...
split <- initial_split(datdf)
train <- training(split)
test <- testing(split)
rf_engine <- rand_forest(
mode = 'classification', mtry = NULL, trees= NULL, min_n= NULL) %>%
set_engine('randomForest')
rf_workflow <- workflow() %>%
add_model(rf_engine) %>%
add_formula(status_group~.)
results<-rf_workflow %>%
fit(train) %>%
predict(test) %>%
bind_cols(test)%>%
conf_mat(estimate=.pred_class,truth=status_group)
results
Truth
Prediction functional functional needs repair non functional
functional 7410 771 1736
functional needs repair 62 134 30
non functional 591 150 3966
tune_spec <-rand_forest(
mode = 'classification',mtry= tune(),trees= NULL, min_n= tune()) %>%
set_engine('randomForest')
tune_wf <- workflow() %>%
add_model(tune_spec)%>%
add_formula(status_group~.)
folds <- vfold_cv(train, v=5)
folds
# 5-fold cross-validation